import numpy as np
import pandas as pd
import eurostat_deaths as eurostat
from datetime import date, datetime as dt, timedelta
import plotly.express as px
import dtw
import plotly.io as pio
pio.renderers.default='notebook'
# (notebook output from `import dtw`): "Importing the dtw module. When using in academic works please cite: T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package. J. Stat. Soft., doi:10.18637/jss.v031.i07."
# get inputs
# NUTS3 region codes and human-readable names for the regions of interest
regions = pd.read_csv('./data/regions.csv')
# one-off download of Eurostat weekly deaths, cached to ./data/deaths.csv;
# kept commented out so the notebook runs from the cached file
#deaths = eurostat.deaths(regions = list(regions['NUTS3']),
# start = datetime.strptime('2015-01-01', '%Y-%m-%d').date())
#deaths.to_csv('./data/deaths.csv', index = False)
# weekly deaths per region/sex/age; 'age' kept as string (codes such as 'TOTAL')
all_deaths = pd.read_csv('./data/deaths.csv', dtype={'age': 'string'})
# region-level population counts from Eurostat
population = eurostat.populations()
# utility function
def week_num_to_date(year, week_num):
    """Return the Monday (as a datetime) starting the given week of the given year.

    Weeks are interpreted with strptime's '%W' convention (week 1 begins on
    the first Monday of the year). The source data's 2020 week numbering is
    shifted by one week relative to that convention, so 7 days are
    subtracted for 2020 — presumably to align with ISO week numbers; verify
    against the Eurostat data if extending.

    Raises:
        ValueError: for any year other than 2020 or 2021. (Previously the
            function silently returned None, which crashed later date
            comparisons downstream.)
    """
    monday = dt.strptime(str(year) + str(week_num) + '-1', '%Y%W-%w')
    if year == 2020:
        # 2020 week numbers are offset by one week vs. '%W'; shift back
        return monday - timedelta(days = 7)
    elif year == 2021:
        return monday
    raise ValueError('week_num_to_date only supports 2020 and 2021, got %r' % (year,))
# filter out irrelevant regions
all_deaths = all_deaths[all_deaths['region'].isin(regions['NUTS3'])]
# compute weekly baseline mortality: average weekly deaths in years 2015-19,
# per (region, sex, age, week-of-year) cell
baseline_mort = all_deaths[ \
all_deaths['year'].isin([2015, 2016, 2017, 2018, 2019]) \
].groupby(['region', 'sex', 'age', 'week'] \
).aggregate({'deaths': 'mean'}).reset_index()
# get excess mortality: attach the pre-pandemic baseline to every 2020+ row
# (suffixes: 'deaths_total' = observed, 'deaths_baseline' = 2015-19 average)
excess_mortality = pd.merge(all_deaths[all_deaths['year'] >= 2020], baseline_mort,
on = ['region', 'sex', 'age', 'week'], how = 'left',
suffixes=('_total', '_baseline'))
# add ISO week start date
excess_mortality['date'] = excess_mortality.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
# filter out data after Jul 15, '21' -- looks bad -- check with Martin?
# NOTE(review): later weeks are presumably still incomplete in the source -- confirm
excess_mortality = excess_mortality[excess_mortality['date'] <= '2021-07-15']
# excess deaths = observed weekly deaths minus the 2015-19 weekly baseline
excess_mortality['excess_deaths'] = excess_mortality['deaths_total'] - excess_mortality['deaths_baseline']
# set up the regions dataset
regions = pd.read_csv('./data/regions.csv')
# 'cluster_1': hand-assigned 6-way clustering of the NUTS3 regions
regions.loc[regions['NUTS3'].isin(['SE214', 'SE322', 'SE221', 'SE212', 'SE213', 'SE321', \
'SE332', 'SE331', 'SE312', 'SE311', 'SE313', 'SE124', \
'SE122', 'SE125', 'CZ041', 'SE231', 'SE211', 'SE121']), 'cluster_1'] = 1
regions.loc[regions['NUTS3'].isin(['SE123', 'CZ051', 'CZ063', 'CZ053', 'CZ052', 'CZ032', \
'CZ031', 'CZ072', 'CZ071']), 'cluster_1'] = 2
regions.loc[regions['NUTS3'].isin(['CZ042', 'PL52', 'PL43', 'PL84', 'CZ064', 'PL72', \
'CZ080', 'CZ020', 'SE224', 'PL62', 'PL42', 'SE232']), 'cluster_1'] = 3
regions.loc[regions['NUTS3'].isin(['PL61', 'PL82', 'PL81', 'PL63', 'PL71', 'SE110']), 'cluster_1'] = 4
regions.loc[regions['NUTS3'].isin(['PL51', 'PL21', 'PL41', 'PL22', 'PL9']), 'cluster_1'] = 5
regions.loc[regions['NUTS3'] == 'CZ010', 'cluster_1'] = 6
# 'cluster_2': cluster_1 shifted down by one, with the old clusters 1 and 2
# merged into a single cluster 1 (i.e. a 5-way clustering)
regions['cluster_2'] = regions['cluster_1'] - 1
regions.loc[regions['cluster_2'] == 0, 'cluster_2'] = 1
regions = regions.rename(columns = {'NUTS3':'region'})
# get all the data together
mort_data_all = pd.merge(regions, excess_mortality, on = ['region'])
# collapse region code and name into a single 'region' label column
mort_data_all['region'] = mort_data_all['region'] + ' - ' + mort_data_all['name']
mort_data_all.drop(['name'], axis = 1, inplace = True)
# function to plot curves
def plot_deaths(data, regions = [], age_groups = [], sex = 'T', metrics = ['deaths_total'],
aggregate_regions = False, aggregate_age_groups = False, aggregate_clusters = False,
plot_by_cluster = False, cluster_type = None, clusters = [],
plot_title = None, log_scale = False):
"""
for later: complete documentation
for now:
the parameter 'sex' must be provided
if plot_by_cluster = True, regions is ignored, but cluster_type and clusters must be provided and valid
if plot_by_cluster = False, regions must be provided and valid
"""
### for later: check that input parameters are acceptable ###
# initialise some variables to be used later in the function
aggregate_dict = {'population': 'sum'}
keep_metrics = []
if any([x in metrics for x in ['total_mortality_100K', 'deaths_total']]):
keep_metrics = keep_metrics + ['deaths_total']
aggregate_dict['deaths_total'] = 'sum'
if any([x in metrics for x in ['excess_mortality_100K', 'excess_deaths']]):
keep_metrics = keep_metrics + ['excess_deaths']
aggregate_dict['excess_deaths'] = 'sum'
metrics_to_drop = list(set(keep_metrics) - set(metrics))
# filter data
if len(age_groups) > 0:
data = data[data['age'].isin(age_groups)]
data = data[data['sex'] == sex]
# further filter and aggregate data based on whether plotting is to be done by region or cluster
if plot_by_cluster == False: # plot by region
# filter data
data = data[data['region'].isin(regions)]
data = data[['region', 'date', 'age', 'population'] + keep_metrics]
# aggregate data
if aggregate_regions == True and aggregate_age_groups == True:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
elif aggregate_regions == True:
if len(age_groups) > 1:
data = data.groupby(['age', 'date']).aggregate(aggregate_dict).reset_index()
colour = 'age'
else:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
elif aggregate_age_groups == True:
if len(regions) > 1:
data = data.groupby(['region', 'date']).aggregate(aggregate_dict).reset_index()
colour = 'region'
else:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
else:
data['region_age'] = 'Region: ' + data['region'] + ', Age: ' + data['age']
data.drop(['age', 'region'], axis = 1, inplace = True)
colour = 'region_age'
elif cluster_type == 1: # plot by cluster 1
# filter data
if len(clusters) > 0:
data = data[data['cluster_1'].isin(clusters)]
data = data[['cluster_1', 'date', 'age', 'population'] + keep_metrics]
# aggregate data
if aggregate_clusters == True and aggregate_age_groups == True:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
elif aggregate_clusters == True:
if len(age_groups) > 1:
data = data.groupby(['age', 'date']).aggregate(aggregate_dict).reset_index()
colour = 'age'
else:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
elif aggregate_age_groups == True:
if len(regions) > 1:
data = data.groupby(['cluster_1', 'date']).aggregate(aggregate_dict).reset_index()
colour = 'cluster_1'
else:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
else:
data = data.groupby(['cluster_1', 'age', 'date']).aggregate(aggregate_dict).reset_index()
data['cluster_age'] = 'Cluster: ' + data['cluster_1'].apply(lambda x: str(x)) + \
', Age: ' + data['age']
data.drop(['age', 'cluster_1'], axis = 1, inplace = True)
colour = 'cluster_age'
elif cluster_type == 2: # plot by cluster 2
# filter data
if len(clusters) > 0:
data = data[data['cluster_2'].isin(clusters)]
data = data[['cluster_2', 'date', 'age', 'population'] + keep_metrics]
# aggregate data
if aggregate_clusters == True and aggregate_age_groups == True:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
elif aggregate_clusters == True:
if len(age_groups) > 1:
data = data.groupby(['age', 'date']).aggregate(aggregate_dict).reset_index()
colour = 'age'
else:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
elif aggregate_age_groups == True:
if len(regions) > 1:
data = data.groupby(['cluster_2', 'date']).aggregate(aggregate_dict).reset_index()
colour = 'cluster_2'
else:
data = data.groupby(['date']).aggregate(aggregate_dict).reset_index()
colour = None
else:
data = data.groupby(['cluster_2', 'age', 'date']).aggregate(aggregate_dict).reset_index()
data['cluster_age'] = 'Cluster: ' + data['cluster_2'].apply(lambda x: str(x)) + \
', Age: ' + data['age']
data.drop(['age', 'cluster_2'], axis = 1, inplace = True)
colour = 'cluster_age'
# create 'derivative metrics' if needed
if 'total_mortality_100K' in metrics:
data['total_mortality_100K'] = data['deaths_total']/data['population']*10**5
if 'excess_mortality_100K' in metrics:
data['excess_mortality_100K'] = data['excess_deaths']/data['population']*10**5
data.drop(['population'], axis = 1, inplace = True)
# remove unncessary metrics
if len(metrics_to_drop) > 0:
data.drop(metrics_to_drop, axis = 1, inplace = True)
# sort data by date (plotly line won't automatically sort data)
data.sort_values(by = ['date', colour], inplace = True)
# reshape data if multiple metrics are required and then visualise; else just visualise without reshape
if len(metrics) > 1:
# reshape
data = pd.melt(data, id_vars = ['date'] + [colour], value_vars = metrics,
var_name='metric', value_name='metric_value')
data[colour + '_metric'] = data[colour] + ', Metric: ' + data['metric']
data.drop([colour, 'metric'], axis = 1, inplace = True)
colour = colour + '_metric'
if log_scale == True:
data['metric_value'] = np.log10(data['metric_value'])
# visualise
if plot_title != None:
if colour == None:
fig = px.line(data.dropna(), x='date', y='metric_value',
title=plot_title)
else:
fig = px.line(data.dropna(), x='date', y='metric_value',
color = colour, title=plot_title)
else:
if colour == None:
fig = px.line(data.dropna(), x='date', y='metric_value')
else:
fig = px.line(data.dropna(), x='date', y='metric_value',
color = colour)
else:
# visualise
if log_scale == True:
data[metrics] = np.log10(data[metrics])
if plot_title != None:
if colour == None:
fig = px.line(data.dropna(), x='date', y=metrics, title=plot_title)
else:
fig = px.line(data.dropna(), x='date', y=metrics,
color = colour, title=plot_title)
else:
if colour == None:
fig = px.line(data.dropna(), x='date', y=metrics)
else:
fig = px.line(data.dropna(), x='date', y=metrics, color = colour)
fig.show()
return
# Build the region groupings to plot: everything, then each country's
# regions selected by NUTS code prefix.
all_region_names = list(mort_data_all['region'].unique())
regions_list = [
    all_region_names,                                   # all
    [r for r in all_region_names if r[:2] == 'CZ'],     # CZ
    [r for r in all_region_names if r[:2] == 'PL'],     # PL
    [r for r in all_region_names if r[:2] == 'SE'],     # SE
]
metrics_list = [
    ['total_mortality_100K'],
    ['excess_mortality_100K'],
    ['total_mortality_100K', 'excess_mortality_100K'],
]
# base plot titles, indexed in parallel with regions_list
base_titles = [
    "All Regions, Age Groups, and Sexes",
    "All CZ Regions, Age Groups, and Sexes",
    "All PL Regions, Age Groups, and All Sexes",
    "All SE Regions, Age Groups, and All Sexes",
]
for regions_id, regions in enumerate(regions_list):
    for metrics in metrics_list:
        log_scale = False
        plot_title = base_titles[regions_id]
        # append the metric description to the title
        if len(metrics) > 1:
            # log_scale = True ### commented out for now, figure later
            # plot_title = plot_title + " - Total and Excess Mortality per 100K Capita (log_10 scale)"
            plot_title = plot_title + " - Total and Excess Mortality per 100K Capita"
        elif metrics[0] == 'total_mortality_100K':
            plot_title = plot_title + " - Total Mortality per 100K Capita"
        elif metrics[0] == 'excess_mortality_100K':
            plot_title = plot_title + " - Excess Mortality per 100K Capita"
        # one figure per (region grouping, metric selection)
        plot_deaths(mort_data_all, age_groups = ['TOTAL'], regions = regions,
                    metrics = metrics, plot_title = plot_title)
# One figure per metric selection, plotting by the first cluster assignment.
for metrics in [
    ['total_mortality_100K'],
    ['excess_mortality_100K'],
    ['total_mortality_100K', 'excess_mortality_100K'],
]:
    log_scale = False
    plot_title = "All Clusters, Age Groups, and Sexes"
    # append the metric description to the title
    if len(metrics) > 1:
        # log_scale = True ### commented out for now, figure later
        # plot_title = plot_title + " - Total and Excess Mortality per 100K Capita (log_10 scale)"
        plot_title = plot_title + " - Total and Excess Mortality per 100K Capita"
    elif metrics[0] == 'total_mortality_100K':
        plot_title = plot_title + " - Total Mortality per 100K Capita"
    elif metrics[0] == 'excess_mortality_100K':
        plot_title = plot_title + " - Excess Mortality per 100K Capita"
    # generate the plot grouped by cluster_1
    plot_deaths(mort_data_all, age_groups = ['TOTAL'], plot_by_cluster = True,
                metrics = metrics, plot_title = plot_title, cluster_type = 1)
# Repeat the cluster plots using the second (merged) cluster assignment.
metrics_list = [
    ['total_mortality_100K'],
    ['excess_mortality_100K'],
    ['total_mortality_100K', 'excess_mortality_100K'],
]
# title suffix per metric selection (log-scale variant left out for now)
metric_titles = {
    ('total_mortality_100K',): " - Total Mortality per 100K Capita",
    ('excess_mortality_100K',): " - Excess Mortality per 100K Capita",
    ('total_mortality_100K', 'excess_mortality_100K'):
        " - Total and Excess Mortality per 100K Capita",
}
for metrics in metrics_list:
    log_scale = False
    plot_title = "All Clusters, Age Groups, and Sexes" + metric_titles[tuple(metrics)]
    # generate the plot grouped by cluster_2
    plot_deaths(mort_data_all, age_groups = ['TOTAL'], plot_by_cluster = True,
                metrics = metrics, plot_title = plot_title, cluster_type = 2)
# Four clusters of regions are created below:
# obtain distance matrices for clustering
### setup: derive the per-100K metrics used as the time series
mort_data_all['total_mortality_100K'] = mort_data_all['deaths_total']/mort_data_all['population']*10**5
mort_data_all['excess_mortality_100K'] = mort_data_all['excess_deaths']/mort_data_all['population']*10**5
### initialise symmetric region-by-region distance matrices:
### dm1/dm2 = max absolute date-wise difference (total/excess mortality);
### dm1_dtw/dm2_dtw = normalized dynamic-time-warping distance
regions_series = mort_data_all['region'].unique()
n_regions = len(regions_series)
dm1 = np.zeros([n_regions, n_regions])
dm2 = np.zeros([n_regions, n_regions])
dm1_dtw = np.zeros([n_regions, n_regions])
dm2_dtw = np.zeros([n_regions, n_regions])
def _region_series(region, metric, drop_null):
    """One-row frame (dates as columns) of `metric` for `region`, date-sorted.

    The transpose facilitates date-wise difference calculations between two
    regions' series. drop_null pre-filters missing values for the first
    series; for the second, nulls are excluded after differencing instead
    (they arise when one series has a value for a date the other lacks).
    """
    rows = mort_data_all[mort_data_all['region'] == region]
    if drop_null:
        rows = rows[pd.notnull(rows[metric])]
    return rows[[metric, 'date']].sort_values(by = 'date').set_index('date').transpose()
### compute and store distances over all unordered region pairs
for reg_id, reg in enumerate(regions_series):
    ts1_total = _region_series(reg, 'total_mortality_100K', drop_null = True)
    ts1_excess = _region_series(reg, 'excess_mortality_100K', drop_null = True)
    for reg_id_2 in range(reg_id + 1, n_regions):
        reg_2 = regions_series[reg_id_2]
        ts2_total = _region_series(reg_2, 'total_mortality_100K', drop_null = False)
        ts2_excess = _region_series(reg_2, 'excess_mortality_100K', drop_null = False)
        diff_total = abs(ts1_total - ts2_total).transpose()
        diff_total = diff_total[pd.notnull(diff_total['total_mortality_100K'])]
        diff_excess = abs(ts1_excess - ts2_excess).transpose()
        diff_excess = diff_excess[pd.notnull(diff_excess['excess_mortality_100K'])]
        # fill both triangles so the matrices stay symmetric
        dm1[reg_id, reg_id_2] = dm1[reg_id_2, reg_id] = diff_total.max().item()
        dm2[reg_id, reg_id_2] = dm2[reg_id_2, reg_id] = diff_excess.max().item()
        dm1_dtw[reg_id, reg_id_2] = dm1_dtw[reg_id_2, reg_id] = \
            dtw.dtw(ts1_total, ts2_total).normalizedDistance
        dm2_dtw[reg_id, reg_id_2] = dm2_dtw[reg_id_2, reg_id] = \
            dtw.dtw(ts1_excess, ts2_excess).normalizedDistance
    # bug fix: the region count was hard-coded as 51
    print(str(reg_id + 1) + ' out of ' + str(n_regions) + ' regions done')
# (notebook output): progress messages "1 out of 51 regions done" ... "51 out of 51 regions done"
# export data files to csv, labelling both axes with the region names
matrix_paths = [
    (dm1, './data/clustering_distance_datasets/total_mortality_100K_wo_dtw.csv'),
    (dm2, './data/clustering_distance_datasets/excess_mortality_100K_wo_dtw.csv'),
    (dm1_dtw, './data/clustering_distance_datasets/total_mortality_100K_w_dtw.csv'),
    (dm2_dtw, './data/clustering_distance_datasets/excess_mortality_100K_w_dtw.csv'),
]
region_index = dict(enumerate(regions_series))
for matrix, path in matrix_paths:
    pd.DataFrame(matrix, columns = regions_series).rename(region_index).to_csv(path)